{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Index(['satisfaction_level', 'last_evaluation', 'number_project',\n",
       "       'average_montly_hours', 'time_spend_company', 'Work_accident', 'left',\n",
       "       'promotion_last_5years', 'salary_high', 'salary_low', 'salary_medium',\n",
       "       'sales_IT', 'sales_RandD', 'sales_accounting', 'sales_hr',\n",
       "       'sales_management', 'sales_marketing', 'sales_product_mng',\n",
       "       'sales_sales', 'sales_support', 'sales_technical'],\n",
       "      dtype='object')"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "%matplotlib inline\n",
    "\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "\n",
    "# Load the HR dataset; header=0 takes the column names from the first CSV row.\n",
    "hr_raw = pd.read_csv('data/hr.csv', header=0)\n",
    "\n",
    "# Drop every row that has a missing value. The raw frame keeps its own name\n",
    "# so the cleaning step can be inspected or re-run without re-reading the file.\n",
    "hr_data = hr_raw.dropna()\n",
    "\n",
    "# One-hot encode the 'salary' and 'sales' columns.\n",
    "data_trnsf = pd.get_dummies(hr_data, columns=['salary', 'sales'])\n",
    "\n",
    "# Last expression of the cell: display the resulting column index.\n",
    "data_trnsf.columns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Feature extraction: separate the target column from the predictors.\n",
    "target_column = 'left'\n",
    "Y = data_trnsf[target_column]\n",
    "X = data_trnsf.drop(target_column, axis='columns')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Number of features: 20\n",
      "Reduced number of features: 5\n"
     ]
    }
   ],
   "source": [
    "# Variance Threshold\n",
    "\n",
    "from sklearn.feature_selection import VarianceThreshold\n",
    "\n",
    "# Keep only the features whose variance exceeds the 0.2 threshold.\n",
    "select_features = VarianceThreshold(threshold=0.2)\n",
    "\n",
    "# Fit the selector and subset the features in a single pass.\n",
    "X_subset = select_features.fit_transform(X)\n",
    "\n",
    "print('Number of features:', X.shape[1])\n",
    "print('Reduced number of features:', X_subset.shape[1])\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Number of features: 20\n",
      "Reduced number of features: 4\n"
     ]
    }
   ],
   "source": [
    "# Chi-squared selector\n",
    "\n",
    "from sklearn.feature_selection import SelectKBest, chi2\n",
    "\n",
    "# Score every feature against Y with the chi-squared statistic and keep\n",
    "# the four highest-scoring ones.\n",
    "chi2_model = SelectKBest(score_func=chi2, k=4)\n",
    "X_best_feat = chi2_model.fit(X, Y).transform(X)\n",
    "\n",
    "# Report the feature count before and after selection.\n",
    "for label, matrix in (('Number of features:', X),\n",
    "                      ('Reduced number of features:', X_best_feat)):\n",
    "    print(label, matrix.shape[1])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[ 1  7  9 17  6  1  1  1  3  8 15  2 12  5  4 13 16 14 11 10]\n",
      "[(1, 'Work_accident'), (1, 'promotion_last_5years'), (1, 'salary_high'), (1, 'satisfaction_level'), (2, 'sales_RandD'), (3, 'salary_low'), (4, 'sales_management'), (5, 'sales_hr'), (6, 'time_spend_company'), (7, 'last_evaluation'), (8, 'salary_medium'), (9, 'number_project'), (10, 'sales_technical'), (11, 'sales_support'), (12, 'sales_accounting'), (13, 'sales_marketing'), (14, 'sales_sales'), (15, 'sales_IT'), (16, 'sales_product_mng'), (17, 'average_montly_hours')]\n"
     ]
    }
   ],
   "source": [
    "# Recursive Feature Elimination\n",
    "\n",
    "from sklearn.feature_selection import RFE\n",
    "from sklearn.linear_model import LogisticRegression\n",
    "\n",
    "# Base classifier used to evaluate each subset of attributes.\n",
    "logistic_model = LogisticRegression()\n",
    "\n",
    "# Eliminate attributes recursively until 4 remain. n_features_to_select is\n",
    "# passed by keyword because recent scikit-learn releases no longer accept it\n",
    "# as a positional argument.\n",
    "rfe = RFE(logistic_model, n_features_to_select=4)\n",
    "rfe = rfe.fit(X, Y)\n",
    "\n",
    "# (rank, column) pairs sorted by rank; rank 1 marks the selected attributes.\n",
    "# Ranks are integers, so they are converted to plain ints and paired with the\n",
    "# column names directly -- no rounding is needed.\n",
    "print(sorted(zip(rfe.ranking_.tolist(), X.columns)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[(0.001, 'sales_product_mng'), (0.0012, 'sales_marketing'), (0.0014, 'promotion_last_5years'), (0.0015, 'sales_RandD'), (0.0015, 'sales_accounting'), (0.0017, 'sales_management'), (0.0019, 'sales_hr'), (0.002, 'sales_IT'), (0.0025, 'sales_support'), (0.0037, 'sales_technical'), (0.0038, 'salary_medium'), (0.0039, 'sales_sales'), (0.0071, 'salary_low'), (0.0073, 'salary_high'), (0.0121, 'Work_accident'), (0.1179, 'last_evaluation'), (0.1187, 'average_montly_hours'), (0.1543, 'number_project'), (0.2152, 'time_spend_company'), (0.3413, 'satisfaction_level')]\n"
     ]
    }
   ],
   "source": [
    "# Feature Importance\n",
    "from sklearn.ensemble import RandomForestClassifier\n",
    "\n",
    "# Fixed seed so the forest, and therefore the importances, are identical on\n",
    "# every run of the notebook.\n",
    "RANDOM_STATE = 42\n",
    "\n",
    "# Fit a random forest to the data.\n",
    "model = RandomForestClassifier(random_state=RANDOM_STATE)\n",
    "model.fit(X, Y)\n",
    "\n",
    "# Relative importance of each attribute, rounded to 4 decimals and sorted in\n",
    "# ascending order. Scores are cast to plain floats so the printed tuples do\n",
    "# not depend on how NumPy renders its scalar types.\n",
    "importances = [\n",
    "    (round(float(score), 4), column)\n",
    "    for score, column in zip(model.feature_importances_, X.columns)\n",
    "]\n",
    "print(sorted(importances))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
